# -*- coding: utf-8 -*-
"""
Created on Mon Nov 30 20:02:03 2015

@author: liangrongli
"""
# Scraper for Qiushibaike (糗事百科) hot posts
import urllib
import urllib2
import re

page = 1
url = 'http://www.qiushibaike.com/hot/page/'+str(page)
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0'
headers = {'User-Agent':user_agent}
try:
    request = urllib2.Request(url,headers=headers)
    reponse = urllib2.urlopen(request)
    content = reponse.read().decode('utf-8')
    pattern = re.compile(r'<h2>(.*?)</h2>.*?<div.*?content">(.*?)<!--.*?</div>.*?<span.*?stats-vote">.*?<i.*?number">(.*?)</i>.*?</span>',re.S)
    items = re.findall(pattern,content)
    for item in items:
        print u"【作者】: ",item[0],"\n"
        print u"【内容】： ",item[1].strip(),"\n"
        print u"【评论数】： ",item[2],"\n"
        print "----------------------------------------------------------------"
except urllib2.URLError,e:
    if hasattr(e,"code"):
        print e.code
    if hasattr(e,"reason"):
        print e.reason
        